{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Feature: WordNet Similarity"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Compute the aggregate similarity of two question token sets according to ontological graph distances in WordNet.\n",
    "\n",
    "Based on the [implementation of the paper \"Sentence Similarity based on Semantic Nets and Corpus Statistics\" by Li et al.](https://github.com/sujitpal/nltk-examples/blob/master/src/semantic/short_sentence_similarity.py)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The `pygoose` utility package imports `numpy`, `pandas`, `matplotlib` and a helper `kg` module into the root namespace."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pygoose import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import math"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Config"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Automatically discover the paths to various data folders and compose the project structure."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Project handle: provides `preprocessed_data_dir` and `save_features()` used further down.\n",
    "project = kg.Project.discover()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Identifier for storing these features on disk and referring to them later."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Passed to `project.save_features()` in the final cell.\n",
    "feature_list_id = 'wordnet_similarity'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Download auxiliary NLTK models and corpora."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package brown to /Users/xshuai/nltk_data...\n",
      "[nltk_data]   Package brown is already up-to-date!\n",
      "[nltk_data] Downloading package wordnet to /Users/xshuai/nltk_data...\n",
      "[nltk_data]   Package wordnet is already up-to-date!\n",
      "[nltk_data] Downloading package averaged_perceptron_tagger to\n",
      "[nltk_data]     /Users/xshuai/nltk_data...\n",
      "[nltk_data]   Package averaged_perceptron_tagger is already up-to-\n",
      "[nltk_data]       date!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Brown corpus: word counts for info_content() inside get_batch_similarities().\n",
    "nltk.download('brown')\n",
    "# WordNet: synset lookups and path distances inside get_batch_similarities().\n",
    "nltk.download('wordnet')\n",
    "# NOTE(review): the tagger is not referenced by any code visible in this notebook -- confirm it is still needed.\n",
    "nltk.download('averaged_perceptron_tagger')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read Data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Preprocessed and tokenized questions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Each loaded item is consumed later as a pair: pair[0] and pair[1] hold the tokens of the two questions.\n",
    "# Presumably both objects are Python lists (they are concatenated with `+` below) -- verify against the preprocessing notebook.\n",
    "tokens_train = kg.io.load(project.preprocessed_data_dir + 'tokens_lowercase_spellcheck_train.pickle')\n",
    "tokens_test = kg.io.load(project.preprocessed_data_dir + 'tokens_lowercase_spellcheck_test.pickle')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train pairs first, then test pairs, so features are computed in one pass;\n",
    "# the result is split again at len(tokens_train) further down.\n",
    "tokens = tokens_train + tokens_test"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build Features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Module-level cache filled lazily by info_content() inside get_batch_similarities():\n",
    "# brown_freqs maps a lowercased Brown-corpus word to its count, N is the total number of words counted.\n",
    "# N == 0 means the cache has not been built yet.\n",
    "brown_freqs = dict()\n",
    "N = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_batch_similarities(pairs, _):\n",
    "    \"\"\"\n",
    "    Compute WordNet-based sentence similarities for a batch of question pairs.\n",
    "\n",
    "    Follows Li et al., \"Sentence Similarity based on Semantic Nets and Corpus\n",
    "    Statistics\": the score is DELTA * semantic similarity (cosine of the two\n",
    "    semantic vectors) + (1 - DELTA) * word-order similarity.\n",
    "\n",
    "    Args:\n",
    "        pairs: iterable of pairs; pair[0] and pair[1] are the token sequences\n",
    "            of the two questions (tokens must be hashable, sortable strings).\n",
    "        _: ignored.\n",
    "\n",
    "    Returns:\n",
    "        A list with one [raw, brown] entry per pair: the similarity without\n",
    "        and with Brown-corpus information content weighting. A component that\n",
    "        is undefined (for example because a question is empty) contributes 0.0\n",
    "        instead of NaN.\n",
    "\n",
    "    Uses the notebook-level names math, np, brown_freqs and N.\n",
    "    \"\"\"\n",
    "    from functools import lru_cache\n",
    "\n",
    "    from nltk.corpus import brown\n",
    "    from nltk.corpus import wordnet as wn\n",
    "\n",
    "    # Parameters to the algorithm. Currently set to the values that were\n",
    "    # reported in the paper to produce the \"best\" results.\n",
    "    ALPHA = 0.2\n",
    "    BETA = 0.45\n",
    "    ETA = 0.4\n",
    "    PHI = 0.2\n",
    "    DELTA = 0.85\n",
    "\n",
    "    ######################### word similarity ##########################\n",
    "\n",
    "    @lru_cache(maxsize=None)\n",
    "    def get_synsets(word):\n",
    "        \"\"\"Return the WordNet synsets of a word (memoized for this batch).\"\"\"\n",
    "        return wn.synsets(word)\n",
    "\n",
    "    def get_best_synset_pair(word_1, word_2):\n",
    "        \"\"\"\n",
    "        Choose the synset pair with the highest path similarity among all\n",
    "        pairs. Returns (None, None) when either word has no synsets or no\n",
    "        pair has a defined path similarity.\n",
    "        \"\"\"\n",
    "        best_pair = None, None\n",
    "        max_sim = -1.0\n",
    "        for synset_1 in get_synsets(word_1):\n",
    "            for synset_2 in get_synsets(word_2):\n",
    "                sim = wn.path_similarity(synset_1, synset_2)\n",
    "                if sim is not None and sim > max_sim:\n",
    "                    max_sim = sim\n",
    "                    best_pair = synset_1, synset_2\n",
    "        return best_pair\n",
    "\n",
    "    def length_dist(synset_1, synset_2):\n",
    "        \"\"\"\n",
    "        Return exp(-ALPHA * l), where l is the length of the shortest path\n",
    "        between two (non-None) synsets in WordNet.\n",
    "        \"\"\"\n",
    "        if synset_1 == synset_2:\n",
    "            l_dist = 0.0\n",
    "        else:\n",
    "            wset_1 = {str(x.name()) for x in synset_1.lemmas()}\n",
    "            wset_2 = {str(x.name()) for x in synset_2.lemmas()}\n",
    "            if wset_1 & wset_2:\n",
    "                # different synsets that share a lemma count as distance 1\n",
    "                l_dist = 1.0\n",
    "            else:\n",
    "                l_dist = synset_1.shortest_path_distance(synset_2)\n",
    "                if l_dist is None:\n",
    "                    # no connecting path reported; treated as 0 (unchanged behavior)\n",
    "                    l_dist = 0.0\n",
    "        return math.exp(-ALPHA * l_dist)\n",
    "\n",
    "    def max_hypernym_distances(synset):\n",
    "        \"\"\"\n",
    "        Map every hypernym of synset to its largest reported distance.\n",
    "        hypernym_distances() returns a set that can list one hypernym with\n",
    "        several distances; taking the maximum keeps the result independent\n",
    "        of set iteration order.\n",
    "        \"\"\"\n",
    "        distances = {}\n",
    "        for hypernym, distance in synset.hypernym_distances():\n",
    "            if distance > distances.get(hypernym, -1):\n",
    "                distances[hypernym] = distance\n",
    "        return distances\n",
    "\n",
    "    def hierarchy_dist(synset_1, synset_2):\n",
    "        \"\"\"\n",
    "        Return tanh(BETA * h), where h measures the depth of the common\n",
    "        subsumer of two (non-None) synsets, to model the fact that nodes\n",
    "        closer to the root are broader and have less semantic similarity\n",
    "        than nodes further away from the root.\n",
    "        \"\"\"\n",
    "        if synset_1 == synset_2:\n",
    "            # same synset: use its largest hypernym distance\n",
    "            h_dist = max(x[1] for x in synset_1.hypernym_distances())\n",
    "        else:\n",
    "            hypernyms_1 = max_hypernym_distances(synset_1)\n",
    "            hypernyms_2 = max_hypernym_distances(synset_2)\n",
    "            lcs_candidates = hypernyms_1.keys() & hypernyms_2.keys()\n",
    "            # 0 when the two synsets have no common hypernym\n",
    "            h_dist = max(\n",
    "                (max(hypernyms_1[c], hypernyms_2[c]) for c in lcs_candidates),\n",
    "                default=0,\n",
    "            )\n",
    "        # same value as (e^x - e^-x) / (e^x + e^-x) with x = BETA * h_dist\n",
    "        return math.tanh(BETA * h_dist)\n",
    "\n",
    "    @lru_cache(maxsize=None)\n",
    "    def word_similarity(word_1, word_2):\n",
    "        \"\"\"\n",
    "        Similarity of two words: length_dist * hierarchy_dist of their best\n",
    "        synset pair, or 0.0 when no such pair exists. Memoized because the\n",
    "        same word pairs are requested by the semantic vectors (twice, once\n",
    "        per weighting) and by the word-order vectors.\n",
    "        \"\"\"\n",
    "        synset_1, synset_2 = get_best_synset_pair(word_1, word_2)\n",
    "        if synset_1 is None or synset_2 is None:\n",
    "            return 0.0\n",
    "        return length_dist(synset_1, synset_2) * hierarchy_dist(synset_1, synset_2)\n",
    "\n",
    "    ######################### sentence similarity ##########################\n",
    "\n",
    "    def most_similar_word(word, candidates):\n",
    "        \"\"\"\n",
    "        Return (candidate, similarity) for the candidate most similar to\n",
    "        word. The first candidate wins ties, so callers pass candidates in\n",
    "        sorted order to keep the choice reproducible. Returns ('', -1.0) for\n",
    "        an empty candidate collection.\n",
    "        \"\"\"\n",
    "        max_sim = -1.0\n",
    "        sim_word = \"\"\n",
    "        for ref_word in candidates:\n",
    "            sim = word_similarity(word, ref_word)\n",
    "            if sim > max_sim:\n",
    "                max_sim = sim\n",
    "                sim_word = ref_word\n",
    "        return sim_word, max_sim\n",
    "\n",
    "    def info_content(lookup_word):\n",
    "        \"\"\"\n",
    "        Uses the Brown corpus available in NLTK to calculate a Laplace\n",
    "        smoothed frequency distribution of words, then uses this information\n",
    "        to compute the information content of the lookup_word.\n",
    "        \"\"\"\n",
    "        global N\n",
    "        if N == 0:\n",
    "            # lazily build the notebook-level frequency table on first use\n",
    "            for sent in brown.sents():\n",
    "                for word in sent:\n",
    "                    word = word.lower()\n",
    "                    brown_freqs[word] = brown_freqs.get(word, 0) + 1\n",
    "                    N = N + 1\n",
    "        n = brown_freqs.get(lookup_word.lower(), 0)\n",
    "        return 1.0 - (math.log(n + 1) / math.log(N + 1))\n",
    "\n",
    "    def semantic_vector(words, joint_words, info_content_norm):\n",
    "        \"\"\"\n",
    "        Computes the semantic vector of a sentence over the joint word list.\n",
    "        An element is 1 if the joint word occurs in the sentence; otherwise\n",
    "        it is the similarity to the most similar word of the sentence when\n",
    "        that similarity exceeds PHI, else 0. Both values are further weighted\n",
    "        by the information content of the word (and of the similar word)\n",
    "        if info_content_norm is True.\n",
    "        \"\"\"\n",
    "        sent_set = set(words)\n",
    "        sent_words = sorted(sent_set)\n",
    "        semvec = np.zeros(len(joint_words))\n",
    "        for i, joint_word in enumerate(joint_words):\n",
    "            if joint_word in sent_set:\n",
    "                semvec[i] = 1.0\n",
    "                if info_content_norm:\n",
    "                    semvec[i] = semvec[i] * math.pow(info_content(joint_word), 2)\n",
    "            else:\n",
    "                sim_word, max_sim = most_similar_word(joint_word, sent_words)\n",
    "                # store the similarity itself (as the docstring and the paper\n",
    "                # describe), not the threshold constant\n",
    "                semvec[i] = max_sim if max_sim > PHI else 0.0\n",
    "                if info_content_norm:\n",
    "                    semvec[i] = semvec[i] * info_content(joint_word) * info_content(sim_word)\n",
    "        return semvec\n",
    "\n",
    "    def semantic_similarity(words_1, words_2, info_content_norm):\n",
    "        \"\"\"\n",
    "        Computes the semantic similarity between two sentences as the cosine\n",
    "        similarity between their semantic vectors. Returns 0.0 when either\n",
    "        vector has zero norm (e.g. an empty sentence) instead of dividing\n",
    "        by zero.\n",
    "        \"\"\"\n",
    "        joint_words = sorted(set(words_1).union(words_2))\n",
    "        vec_1 = semantic_vector(words_1, joint_words, info_content_norm)\n",
    "        vec_2 = semantic_vector(words_2, joint_words, info_content_norm)\n",
    "        norm_product = np.linalg.norm(vec_1) * np.linalg.norm(vec_2)\n",
    "        if norm_product == 0:\n",
    "            return 0.0\n",
    "        return np.dot(vec_1, vec_2) / norm_product\n",
    "\n",
    "    ######################### word order similarity ##########################\n",
    "\n",
    "    def word_order_vector(words, joint_words, windex):\n",
    "        \"\"\"\n",
    "        Computes the word order vector for a sentence over the joint word\n",
    "        list. An element is the windex position of the joint word if it\n",
    "        occurs in the sentence; otherwise the position of the most similar\n",
    "        word of the sentence when the similarity exceeds ETA, else 0.\n",
    "        \"\"\"\n",
    "        wordset = set(words)\n",
    "        sent_words = sorted(wordset)\n",
    "        wovec = np.zeros(len(joint_words))\n",
    "        for i, joint_word in enumerate(joint_words):\n",
    "            if joint_word in wordset:\n",
    "                wovec[i] = windex[joint_word]\n",
    "            else:\n",
    "                sim_word, max_sim = most_similar_word(joint_word, sent_words)\n",
    "                if max_sim > ETA:\n",
    "                    wovec[i] = windex[sim_word]\n",
    "        return wovec\n",
    "\n",
    "    def word_order_similarity(words_1, words_2):\n",
    "        \"\"\"\n",
    "        Computes the word-order similarity between two sentences as the\n",
    "        normalized difference of their word order vectors. Returns 0.0 when\n",
    "        both vectors are all zero (both sentences empty).\n",
    "        \"\"\"\n",
    "        # sorted so the positions do not depend on set iteration order\n",
    "        joint_words = sorted(set(words_1).union(words_2))\n",
    "        # positions start at 1 because 0 marks 'no matching word' in the vectors\n",
    "        windex = {word: index for index, word in enumerate(joint_words, start=1)}\n",
    "        r1 = word_order_vector(words_1, joint_words, windex)\n",
    "        r2 = word_order_vector(words_2, joint_words, windex)\n",
    "        denominator = np.linalg.norm(r1 + r2)\n",
    "        if denominator == 0:\n",
    "            return 0.0\n",
    "        return 1.0 - (np.linalg.norm(r1 - r2) / denominator)\n",
    "\n",
    "    ######################### overall similarity ##########################\n",
    "\n",
    "    def pair_similarities(words_1, words_2):\n",
    "        \"\"\"\n",
    "        Return [raw, brown]: the overall similarity without and with\n",
    "        information content weighting. The word-order part does not depend\n",
    "        on the weighting, so it is computed once and shared.\n",
    "        \"\"\"\n",
    "        order_sim = word_order_similarity(words_1, words_2)\n",
    "        return [\n",
    "            DELTA * semantic_similarity(words_1, words_2, info_content_norm)\n",
    "            + (1.0 - DELTA) * order_sim\n",
    "            for info_content_norm in (False, True)\n",
    "        ]\n",
    "\n",
    "    ######################### main ##########################\n",
    "\n",
    "    return [pair_similarities(pair[0], pair[1]) for pair in pairs]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Batches:   0%|          | 8/2751 [03:47<21:38:52, 28.41s/it]"
     ]
    },
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-11-0dc86a3d20d7>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      2\u001b[0m     \u001b[0mtokens\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m     \u001b[0mbatch_mapper\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mget_batch_similarities\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m     \u001b[0mbatch_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1000\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      5\u001b[0m )\n",
      "\u001b[0;32m/usr/local/Cellar/python3/3.6.3/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pygoose/kg/jobs.py\u001b[0m in \u001b[0;36mmap_batch_parallel\u001b[0;34m(input_list, batch_size, item_mapper, batch_mapper, flatten, n_jobs, **kwargs)\u001b[0m\n\u001b[1;32m    139\u001b[0m             \u001b[0mdesc\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'Batches'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    140\u001b[0m             \u001b[0mtotal\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbatches\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 141\u001b[0;31m             \u001b[0mfile\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstdout\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    142\u001b[0m         )\n\u001b[1;32m    143\u001b[0m     )\n",
      "\u001b[0;32m/usr/local/Cellar/python3/3.6.3/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/joblib/parallel.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, iterable)\u001b[0m\n\u001b[1;32m    928\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    929\u001b[0m             \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mretrieval_context\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 930\u001b[0;31m                 \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mretrieve\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    931\u001b[0m             \u001b[0;31m# Make sure that we get a last message telling us we are done\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    932\u001b[0m             \u001b[0melapsed_time\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_start_time\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/local/Cellar/python3/3.6.3/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/joblib/parallel.py\u001b[0m in \u001b[0;36mretrieve\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    831\u001b[0m             \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    832\u001b[0m                 \u001b[0;32mif\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'supports_timeout'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 833\u001b[0;31m                     \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_output\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjob\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    834\u001b[0m                 \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    835\u001b[0m                     \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_output\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjob\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/local/Cellar/python3/3.6.3/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/joblib/_parallel_backends.py\u001b[0m in \u001b[0;36mwrap_future_result\u001b[0;34m(future, timeout)\u001b[0m\n\u001b[1;32m    519\u001b[0m         AsyncResults.get from multiprocessing.\"\"\"\n\u001b[1;32m    520\u001b[0m         \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 521\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mfuture\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    522\u001b[0m         \u001b[0;32mexcept\u001b[0m \u001b[0mLokyTimeoutError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    523\u001b[0m             \u001b[0;32mraise\u001b[0m \u001b[0mTimeoutError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/local/Cellar/python3/3.6.3/Frameworks/Python.framework/Versions/3.6/lib/python3.6/concurrent/futures/_base.py\u001b[0m in \u001b[0;36mresult\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m    425\u001b[0m                 \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__get_result\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    426\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 427\u001b[0;31m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_condition\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    428\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    429\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_state\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mCANCELLED\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mCANCELLED_AND_NOTIFIED\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/local/Cellar/python3/3.6.3/Frameworks/Python.framework/Versions/3.6/lib/python3.6/threading.py\u001b[0m in \u001b[0;36mwait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m    293\u001b[0m         \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m    \u001b[0;31m# restore state no matter what (e.g., KeyboardInterrupt)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    294\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mtimeout\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 295\u001b[0;31m                 \u001b[0mwaiter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0macquire\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    296\u001b[0m                 \u001b[0mgotit\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    297\u001b[0m             \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "# Presumably get_batch_similarities() is called once per batch of 1000 token pairs -- confirm against kg.jobs.\n",
    "# NOTE(review): the stored output of this cell is from a run that was interrupted after 8 of 2751 batches\n",
    "# (about 28 s per batch), so the cells below have no stored results in this notebook.\n",
    "similarities = kg.jobs.map_batch_parallel(\n",
    "    tokens,\n",
    "    batch_mapper=get_batch_similarities,\n",
    "    batch_size=1000,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Convert to an array so rows can be sliced and `.shape` inspected below.\n",
    "# Expected shape is (number of pairs, 2) -- assumes map_batch_parallel flattens the per-batch lists, TODO confirm.\n",
    "similarities = np.array(similarities)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Undo the train + test concatenation: the first len(tokens_train) rows belong to the training set.\n",
    "X_train = similarities[:len(tokens_train)]\n",
    "X_test = similarities[len(tokens_train):]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Sanity check: show the shapes of the two feature matrices before saving.\n",
    "print('X_train:', X_train.shape)\n",
    "print('X_test: ', X_test.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Same order as the two values get_batch_similarities() returns per pair:\n",
    "# first without, then with Brown-corpus information content weighting.\n",
    "feature_names = [\n",
    "    'wordnet_similarity_raw',\n",
    "    'wordnet_similarity_brown',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Hand the train/test matrices and their column names to the project helper under `feature_list_id`.\n",
    "# Presumably this writes them to the project's feature directory -- verify against kg.Project.\n",
    "project.save_features(X_train, X_test, feature_names, feature_list_id)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
